// AO Bench by OpenCL
// imported by kioku @ System K 2009
// 
// about AO Bench[http://lucille.atso-net.jp/aobench/]
//

//const float hwidth = 256.0f*0.5f;
//const float hheight = 256.0f*0.5f; 
struct Ray
{
	float4 org;
	float4 dir;
};
struct Sphere
{
	float4 center;
	float radius;
};
struct Plane
{
	float4 p;
	float4 n;
};

struct Intersection
{
    float t;
    float4 p;     // hit point
    float4 n;     // normal
    int hit;
};

//struct Sphere sphere[3];
//struct Plane plane;
//int seed = 0;

void inline shpere_intersect(struct Sphere* s, struct Ray* ray, struct Intersection* isect)
{
    float4 rs = ray->org - s->center;
    float B = dot(rs, ray->dir);
    float C = dot(rs, rs) - (s->radius * s->radius);
    float D = B * B - C;

    if (D > 0.0)
    {
		float t = -B - native_sqrt(D);
		if ( (t > 0.0f) && (t < isect->t) )
		{
			isect->t = t;
			isect->hit = 1;

			// calculate normal.
			float4 p = (float4)(ray->org.x + ray->dir.x * t,
						  ray->org.y + ray->dir.y * t,
						  ray->org.z + ray->dir.z * t,
						  0.0f);
			float4 n = p - s->center;
			n = fast_normalize(n);
			isect->n = n;
			isect->p = p;
		}
	}
}

void inline plane_intersect(struct Plane* pl, struct Ray* ray, struct Intersection* isect)
{
  	float d = -dot(pl->p, pl->n);
	float v = dot(ray->dir, pl->n);

	if (fabs(v) < 1.0e-6f)
		return; // the plane is parallel to the ray.

	float t = native_divide(-(dot(ray->org, pl->n) + d),v);

    if ( (t > 0.0) && (t < isect->t) )
    {
		isect->hit = 1;
		isect->t   = t;
		isect->n   = pl->n;

		float4 p = (float4)(ray->org.x + t * ray->dir.x,
					  ray->org.y + t * ray->dir.y,
					  ray->org.z + t * ray->dir.z,
					  0.0f);
		isect->p = p;
	}
}


void inline Intersect(struct Ray* r, struct Intersection* i)
{

	struct Sphere sphere[3]={{{-2.0f, 0.0f, -3.5f, 0.0f},0.5f},{{-0.5f, 0.0f, -3.0f, 0.0f},0.5f},{{1.0f, 0.0f, -2.2f, 0.0f},0.5f}};
	struct Plane plane={{0.0f,-0.5f, 0.0f, 0.0f},{0.0f, 1.0f, 0.0f, 0.0f}};

	shpere_intersect(&sphere[0], r, i);
	shpere_intersect(&sphere[1], r, i);
	shpere_intersect(&sphere[2], r, i);

	plane_intersect(&plane, r, i);
}

void inline orthoBasis(float4 basis[3], float4 n)
{
	//basis[2] = (float4)(n.x, n.y, n.z, 0.0f);
	basis[2].x=n.x;
	basis[2].y=n.y;
	basis[2].z=n.z;
	basis[2].w=0.0f;

	//basis[1] = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
	basis[1].x=0.0f;
	basis[1].y=0.0f;
	basis[1].z=0.0f;
	basis[1].w=0.0f;

	if ((n.x < 0.6f) && (n.x > -0.6f))
		basis[1].x = 1.0f;
	else if ((n.y < 0.6) && (n.y > -0.6f))
		basis[1].y = 1.0f;
	else if ((n.z < 0.6f) && (n.z > -0.6f))
		basis[1].z = 1.0f;
	else
		basis[1].x = 1.0f;


	basis[0] = cross(basis[1], basis[2]);
	basis[0] = fast_normalize(basis[0]);

	basis[1] = cross(basis[2], basis[0]);
	basis[1] = fast_normalize(basis[1]);

}

//float random(float* seed)
//{
//	*seed = (int)(fmod((float)(*seed)*1364.0f+626.0f, 509.0f));
//	return (float)(*seed)/509.0f;
//}

const int ntheta = 16;
const int nphi   = 16;
const float eps  = 0.0001f;

float inline computeAO(struct Intersection* isect, float* sd)
{
	int i, j;

    // Slightly move ray org towards ray dir to avoid numerical probrem.
    float4 p = (float4)(isect->p.x + eps * isect->n.x,
                  isect->p.y + eps * isect->n.y,
                  isect->p.z + eps * isect->n.z,
				  0.0f);

    // Calculate orthogonal basis.
    float4 basis[3];
    orthoBasis(basis, isect->n);
	
    float occlusion = 0.0f;

    for (j = 0; j < ntheta; j++)
    {
		for (i = 0; i < nphi; i++)
		{
			// Pick a random ray direction with importance sampling.
			// p = cos(theta) / 3.141592f
			*sd = (int)(fmod((float)(*sd)*1364.0f+626.0f, 509.0f));
			float r = native_divide(*sd,509.0f);    //(float)(seed)/509.0f;//random(seed);
			*sd = (int)(fmod((float)(*sd)*1364.0f+626.0f, 509.0f));
			float phi = native_divide(*sd,509.0f) * 2.0f * 3.141592f;  //2.0f * 3.141592f * random();

			float4 ref;
			ref.x = native_cos(phi) * native_sqrt(1.0f - r);
			ref.y = native_sin(phi) * native_sqrt(1.0f - r);
			ref.z = native_sqrt(r);

			// local -> global
			float4 rray;
			rray.x = ref.x * basis[0].x + ref.y * basis[1].x + ref.z * basis[2].x;
			rray.y = ref.x * basis[0].y + ref.y * basis[1].y + ref.z * basis[2].y;
			rray.z = ref.x * basis[0].z + ref.y * basis[1].z + ref.z * basis[2].z;

			float4 raydir = (float4)(rray.x, rray.y, rray.z, 0.0f);

			struct Ray ray;
			ray.org = p;
			ray.dir = raydir;

			struct Intersection occIsect;
			occIsect.hit = 0;
			occIsect.t = 1.0e+30f;
			occIsect.n = occIsect.p = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
			Intersect(&ray, &occIsect);
			if (occIsect.hit != 0)
				occlusion += 1.0f;

		}
	}

	// [0.0, 1.0]
    occlusion = native_divide(((float)(ntheta * nphi) - occlusion), (float)(ntheta * nphi));
	return occlusion;//(float4)(occlusion, occlusion, occlusion, 1.0f);
}

__kernel void main(
__global uint * out)
{
	
	float hwidth=get_global_size(0)>>1;
	float hheight=get_global_size(1)>>1;
	
	uint nIndex = get_global_id(0) + get_global_id(1) * get_global_size(0);

	struct Intersection i;
	i.hit = 0;
	i.t = 1.0e+30;
	i.n = i.p = (float4)(0, 0, 0, 0);
	
	float px = ((float)(get_global_id(0) ) - hwidth) / hwidth;
	float py = ((float)(get_global_id(1) ) - hheight) / hheight;
	float4 dir = fast_normalize((float4)(px, py, -1.0f, 0.0f));
	float4 org= (float4)(0,0,0,0);
	struct Ray r;
	r.org = org;
	r.dir = dir;
	//seed = (int)(fmod((dir.x+hwidth) * (dir.y+hheight) * 4525434.0f, 65536.0f));
	
	uchar rcol = 0;
	Intersect(&r, &i);
	if (i.hit != 0)
	{
		float s = fmod((dir.x+hwidth) * (dir.y+hheight) * MAXFLOAT, 65536.0f);
		rcol = (uchar)(computeAO(&i, &s) * 255);
		//seed = s;
	}
	
	//float cx = dir.x + 1.0f;
	//float cy = dir.y + 1.0f;
	//out[nIndex] = i.hit;
	//out[nIndex] = (int)(cx*128.0f) + (int)(cy*128.0f)*256;
	out[nIndex] = (uint)(rcol | (rcol<<8) | (rcol<<16) | (255<<24));
}
